We will run GEDI on the pbmc data from https://doi.org/10.1038/s41587-020-0534-z
library(scran)
library(scater)
library(tictoc)
library(Matrix)
library(uwot)
source("gedi/scIntegration.v98.svdC.R") # load GEDI functions
sce<- readRDS("sce.rds")
var_genes<- readRDS("variable_genes.rds")
sce$log10_sum<- log10(sce$sum) ## adding variable of log10(totalcounts)
raw_counts<- assay(sce, "counts")
table(sce$Sample)
10x Chromium (v2) A 10x Chromium (v2) B 10x Chromium (v3) CEL-Seq2
3222 3222 3222 253
Drop-seq inDrops Seq-Well Smart-seq2
3222 3222 3222 253
Filtering out low expressed genes
## Filtering low expressed genes ( at least 3 cells express more than 1 counts)
sum_genes<- rowSums(raw_counts>5)
genes_use<- sum_genes>3
table(genes_use)
genes_use
FALSE TRUE
22160 11534
sce<- sce[genes_use,]
sce
class: SingleCellExperiment
dim: 11534 19838
metadata(0):
assays(2): counts logcounts
rownames(11534): DPM1 SCYL3 ... CTD-2060L22.1 RP11-107E5.4
rowData names(0):
colnames(19838): pbmc1_SM2_Cell_108 pbmc1_SM2_Cell_115 ...
pbmc1_inDrops_TGTTATCA-GGAGGTAA-TAAATAGG
pbmc1_inDrops_TGTTATCA-GGAGGTAA-TGGTATGA
colData names(20): orig.ident nCount_RNA ... sizeFactor log10_sum
reducedDimNames(0):
altExpNames(0):
This step is optional, but it augments performance and reduces optimization time.
var_genes<- intersect(var_genes, rownames(sce) )
raw_counts<- raw_counts[var_genes,]
c_mat<- readRDS("C_matrices/CellMarker.rds")
Setting up C matrix
commongenes<- intersect(rownames(c_mat), rownames(raw_counts) )
c_mat<- c_mat[commongenes,]
raw_counts<- raw_counts[commongenes,]
# remove any columns that results in singularity
QR <- qr(crossprod(c_mat))
c_mat <- c_mat[ , QR$pivot[seq_len(QR$rank)] ]
dim(colData(sce))
[1] 19838 20
dim(c_mat)
[1] 3011 173
dim(raw_counts)
[1] 3011 19838
Main arguments for running GEDI are:
Optional arguments:
For this run, we are going to use GEDI with the raw counts, Bsphere mode and a matrix of cell type markers ( from cellmarker database) as prior biological information.
K<- 100
itelim <- 150
mode <- "Bsphere"
oi_shrinkage<- 0.001
model <- new("GEDI") # Initialize GEDI object
model$setup( Samples = sce$Sample, M = raw_counts, C=c_mat, mode = mode, K = K, oi_shrinkage=oi_shrinkage ) # set up parameters
model$initialize.LVs(randomSeed = 1) # initialize LVs
tic("Optimization")
model$optimize(itelim) # run model
toc()
We can also resume optimization if needed.
itelim<- 350
tic("resuming optimization")
model$optimize(itelim)
toc()
Saving model
saveRDS(model, "model.rds")
Loading model
model<- readRDS('model.rds')
model$plotTracking()
$ZAo
$Bi
$Qi
$oi
$si
$Ri
NULL
$sigma2
# Return the SVD projection of the integrated data
# This function returns a K x N matrix, with K being the number of latent variables and N being the number of cells
model_svd<- model$svd.projection()
# umap
set.seed(43)
umap_obj <- umap( t(model_svd), a=7, b=0.8, min_dist=0.01, metric="euclidean")
colnames(umap_obj)<- paste0("umap", 1:2)
rownames(umap_obj)<- colnames(sce)
reducedDim(sce, "umap")<- umap_obj ## Adding umap coordinates to sce object
for(var_plot in c("Sample", "CellType", "log10_sum", "subsets_mito_percent") ){
print( plot_dimred(sce, var_plot, size_dot=0.05) )
}
Loading required package: RColorBrewer
Loading required package: ggrastr
Loading required package: viridis
Loading required package: viridisLite
Attaching package: 'viridis'
The following object is masked from 'package:scales':
viridis_pal
Measuring performance of integration
alignment_score(t(umap_obj), data.frame(colData(sce)), var_use="Sample")
Loading required package: BiocNeighbors
[1] 0.9677019
1- alignment_score(t(umap_obj), data.frame(colData(sce)), var_use="CellType")
[1] 0.7586957
We observe a high alignment score ( good batch correction) and good values for the cell type conservation.
A_lv<- model$getA()
A_cell<- model$getADB()
Visualizing activities per latent variable
pheatmap.colorsymmetric(A_lv, fontsize_row=3, fontsize_col=5)
Loading required package: pheatmap
This heatmap associates each latent variable to a given gene set from our C matrix. So, we could identify the main lvs driving variability in our dataset.
Other approach that is also helpful is to look at the activities per cell. Using these, we can find if any of our cell populations of interest show differential activity patterns. In this case, we will use the 'CellType' category, but we could use any label, e.g. derive clusters, etc.
sce_A<- SingleCellExperiment( list(logcounts=A_cell), colData=colData(sce))
# Doing a t test looking for upregulated markers for each population ( cell types in this case)
markers_sce <- findMarkers(sce_A, group=colData(sce)[,"CellType"], test.type="t", pval.type="all", direction="up")
lis_markers<- summary_markers(markers_sce)
markers_fdr<- do.call(cbind, lapply(lis_markers, function(X) X[,"neglog_FDR",drop=FALSE] ) )
colnames(markers_fdr)<- names(lis_markers)
markers_fdr
Plotting -log10 FDR values for association of activity per each celltype Filtered heatmap with only rows with at least 1 significant association
filt<- markers_fdr[apply(markers_fdr, 1, max) > -log10(0.05),]
pheatmap.colorsymmetric(filt)
Visualizing the highest association for each cell type
colData(sce)<- cbind(colData(sce), t(A_cell)[colnames(sce),] )
for( celltype in colnames(markers_fdr) ){
cat("Celltype:", celltype, "\n")
sig_markers<- markers_fdr[markers_fdr[,celltype] > -log10(0.05),]
sig_markers<- sig_markers[order(-sig_markers[,celltype]),]
if( nrow(sig_markers) > 0 ){
for ( var_look in head(rownames(sig_markers),1) ){
print( plot_dimred(sce, var_look, size_dot=0.05, center=TRUE) )
print(plotExpression(sce_A,
features = var_look,
x = "CellType",
exprs_values = "logcounts",
colour_by = "CellType",
point_size = 0.1) +
labs(title=celltype, y="activity" ) +
fun_theme_plot()
)
}
}
}
Celltype: B cell
Celltype: CD14+ monocyte
Celltype: CD16+ monocyte
Celltype: CD4+ T cell
Celltype: Cytotoxic T cell
Celltype: Dendritic cell
Celltype: Megakaryocyte
Celltype: Natural killer cell
Celltype: Plasmacytoid dendritic cell
Celltype: Unassigned
We observe relatively good annotation using this C matrix, with good predicted activities for Bcells, CD14+ monocytes, NK cells and Plasmacytoid dendritic cells.
Perform imputation given the raw counts.
# This function returns a J x N matrix, with J being the number of genes and N being the number of cells
# Residual of Yi after taking into account everything other than ZDBi
# The effect of sample-specific distortions and cell-specific library sizes is removed
imputedY<- model$imputeY()
sce_impute<- SingleCellExperiment(list(imputedY=imputedY), colData=colData(sce) )
vec_genes<- c( "IL7R")
for( gene_look in vec_genes) {
print( plotExpression(sce,
features = gene_look,
x = "CellType",
exprs_values = "logcounts",
colour_by = "CellType",
point_size = 0.1)+
fun_theme_plot()
)
print( plotExpression(sce_impute,
features = gene_look,
x = "CellType",
exprs_values = "imputedY",
colour_by = "CellType",
point_size = 1) +
fun_theme_plot()
)
}
sessionInfo()
R version 4.0.0 (2020-04-24)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2020/Core/imkl/2020.1.217/compilers_and_libraries_2020.1.217/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
[1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8
[5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
[7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] pheatmap_1.0.12 BiocNeighbors_1.8.2
[3] viridis_0.5.1 viridisLite_0.3.0
[5] ggrastr_0.2.3 RColorBrewer_1.1-2
[7] scales_1.1.1 corpcor_1.6.9
[9] rsvd_1.0.5 ClusterR_1.2.5
[11] gtools_3.8.2 quadprog_1.5-8
[13] Rcpp_1.0.6 uwot_0.1.10
[15] Matrix_1.3-3 tictoc_1.0
[17] scater_1.18.6 ggplot2_3.3.3
[19] scran_1.18.6 SingleCellExperiment_1.12.0
[21] SummarizedExperiment_1.20.0 Biobase_2.50.0
[23] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7
[25] IRanges_2.24.1 S4Vectors_0.28.1
[27] BiocGenerics_0.36.0 MatrixGenerics_1.2.1
[29] matrixStats_0.58.0
loaded via a namespace (and not attached):
[1] bitops_1.0-6 RcppAnnoy_0.0.18
[3] tools_4.0.0 bslib_0.2.4
[5] utf8_1.2.1 R6_2.5.0
[7] irlba_2.3.3 vipor_0.4.5
[9] DBI_1.1.1 colorspace_2.0-0
[11] withr_2.4.1 tidyselect_1.1.0
[13] gridExtra_2.3 compiler_4.0.0
[15] DelayedArray_0.16.3 labeling_0.4.2
[17] sass_0.3.1 stringr_1.4.0
[19] digest_0.6.27 rmarkdown_2.7
[21] XVector_0.30.0 pkgconfig_2.0.3
[23] htmltools_0.5.1.1 sparseMatrixStats_1.2.1
[25] highr_0.8 limma_3.46.0
[27] rlang_0.4.10 DelayedMatrixStats_1.12.3
[29] farver_2.1.0 jquerylib_0.1.3
[31] generics_0.1.0 jsonlite_1.7.2
[33] BiocParallel_1.24.1 dplyr_1.0.5
[35] RCurl_1.98-1.3 magrittr_2.0.1
[37] BiocSingular_1.6.0 GenomeInfoDbData_1.2.4
[39] scuttle_1.0.4 ggbeeswarm_0.6.0
[41] munsell_0.5.0 fansi_0.4.2
[43] lifecycle_1.0.0 stringi_1.5.3
[45] yaml_2.2.1 edgeR_3.32.1
[47] zlibbioc_1.36.0 grid_4.0.0
[49] dqrng_0.2.1 crayon_1.4.1
[51] lattice_0.20-41 cowplot_1.1.1
[53] beachmat_2.6.4 locfit_1.5-9.4
[55] knitr_1.31 pillar_1.5.1
[57] igraph_1.2.6 codetools_0.2-16
[59] glue_1.4.2 evaluate_0.14
[61] vctrs_0.3.7 gtable_0.3.0
[63] purrr_0.3.4 assertthat_0.2.1
[65] xfun_0.22 RcppEigen_0.3.3.9.1
[67] RSpectra_0.16-0 tibble_3.1.0
[69] beeswarm_0.3.1 gmp_0.6-2
[71] bluster_1.0.0 statmod_1.4.35
[73] ellipsis_0.3.1